This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

plot(rnorm(100))

plot(cars)

# Data Visualization
# ggplot2 "grammar of graphics"
#libraries need to be reloaded for every new session
library(tidyverse)
Loading tidyverse: ggplot2
Loading tidyverse: tibble
Loading tidyverse: tidyr
Loading tidyverse: readr
Loading tidyverse: purrr
Loading tidyverse: dplyr
Conflicts with tidy packages -------------------------------
filter(): dplyr, stats
lag():    dplyr, stats

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).

# 3.2 
# do cars with large engines use more fuel than cars with small engines?
# what does the relationship between engine size and fuel efficiency look like?
# use mpg: a data frame containing observations collected by USEPA on cars
mpg

image of the mpg data frame

# plot of the mpg data using ggplot2
# displ, a car's engine size, is on the x-axis
# hwy, a car's highway efficiency, on the y-axis
# geom_point() adds a 'layer' to this graph and makes it a scatter plot
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy))

– plot of the data from ‘mpg’, plotting a car’s engine size on the x-axis against its highway efficiency on the y-axis – looks like cars with larger engines use more fuel (duh)

ggplot(data = mpg) +
  geom_point(mapping = aes(x = hwy, y = cyl))

# 3.3 Aesthetic Mappings 
# adding a 3rd var to a 2D scatter plot by mapping it to a visual property (an aesthetic) of the objects in your plot
# reproduction of the above plot with class added as a color
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, color = class))

#class is an unordered var, so it may not be optimal to map it using size
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, size = class))

# alpha is transparency
ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy, alpha = class))

# shape has six shapes used
ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy, shape = class))

r rules

# the aesthetic properties may be manually set
ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy), color = "pink")

# aesthetics can also be mapped to a continuous variable:
# here cty (city mileage) is mapped to size, then to color
ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy, size = cty))

ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy, color = cty))

These plots map a continuous variable (city mileage) to different aesthetics (size and color).

# 3.5 Facets
# another method for multi variable data plotting
# var to facet wrap should be discrete
#example of 1 variable faceting
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_wrap(~class, nrow = 2)

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_grid(.~class)

A plot of highway mileage against engine size (displ), faceted by class of vehicle (one variable).

# example of 2-var faceting
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_grid(drv~class)

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_grid(drv~cyl)

Plot of data faceted on two variables

# 3.6 Geometric Objects
# a 'geom' is the geometrical object that a plot uses to represent data
ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy))

ggplot(data = mpg) + 
  geom_smooth(mapping = aes(x = displ, y = hwy))

# these on the same plot
#note: pass the mappings to the ggplot function to avoid duplication errors
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 
  geom_point() + 
  geom_smooth()

different geometric representations of the same data

#different aesthetics can be displayed in different layers
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 
  geom_point(mapping = aes(color = class)) + 
  geom_smooth()

same plot as fig 14.c , with the scatter plot colored by class

# local data arguments will override globals 
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 
  geom_point(mapping = aes(color = class)) + 
  geom_smooth(data = filter(mpg, class == "subcompact"), se = FALSE)

Same plot as fig 15 with line filtered to include only class subcompact

ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = drv)) + 
  geom_point() + 
  geom_smooth(se = FALSE)

ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 
  geom_point() + 
  geom_smooth(mapping = aes(group = drv))

ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 
  geom_point(mapping = aes(color = drv)) + 
  geom_smooth(se = FALSE)

ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 
  geom_point(mapping = aes(color = drv)) + 
  geom_smooth(mapping = aes(linetype = drv), se = FALSE)

# Statistical Transforms
#bar charts, histograms, frequency polygons bin and count data
#smoothers fit a model to your data, plot predictions come from this model 
# box plots compute a summary of the distribution of the data
# stat_count() can be used instead of geom_bar() here
ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut))

chart of diamond count by cut quality

# use stat = "identity" to override geom_bar()'s default stat and plot the precomputed freq values
demo <- tribble(
  ~cut,         ~freq,
  "Fair",       1610,
  "Good",       4906,
  "Very Good",  12082,
  "Premium",    13791,
  "Ideal",      21551
)
ggplot(data = demo) +
  geom_bar(mapping = aes(x = cut, y = freq), stat = "identity")

# Summarise depth for each cut by its minimum, median and maximum.
# fun / fun.min / fun.max replace fun.y / fun.ymin / fun.ymax, which are
# deprecated since ggplot2 3.3.0.
ggplot(data = diamonds) +
  stat_summary(
    mapping = aes(x = cut, y = depth),
    fun.min = min,
    fun.max = max,
    fun = median
  )

Fig. a: bars plotted from the precomputed freq values rather than from a computed count. Fig. b: statistical summary (min, median, max) of depth for each cut.

ggplot(data = diamonds) + 
  geom_bar(mapping = aes(x = cut, fill = clarity))

ggplot(data = diamonds, mapping = aes(x = cut, fill = clarity)) + 
  geom_bar(alpha = 1/5, position = "identity")

ggplot(data = diamonds, mapping = aes(x = cut, colour = clarity)) + 
  geom_bar(fill = NA, position = "identity")

ggplot(data = diamonds) + 
  geom_bar(mapping = aes(x = cut, fill = clarity), position = "fill")

ggplot(data = diamonds) + 
  geom_bar(mapping = aes(x = cut, fill = clarity), position = "dodge")

Colored bar charts: fill/colour introduces a second variable (clarity), arranged using different position adjustments.

# in the plain scatter plot many points overlap, so some will be obscured
# position = "jitter" adds a small amount of noise to each point so that all of them can be seen
ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy))

ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy), position = "jitter")

Jittered vs. non-jittered

# 3.9 Coordinate systems
ggplot(data = mpg, mapping = aes(x = class, y = hwy)) + 
  geom_boxplot()

ggplot(data = mpg, mapping = aes(x = class, y = hwy)) + 
  geom_boxplot() +
  coord_flip()

Flipping the coordinate system is useful for displaying horizontal boxplots.

# Install maps only when it is missing, so re-running the notebook does not
# download and reinstall the package every time.
if (!requireNamespace("maps", quietly = TRUE)) {
  install.packages("maps")
}
Installing package into ‘/Users/sarahbarili/Library/R/3.3/library’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.3/maps_3.1.1.tgz'
Content type 'application/x-gzip' length 3601273 bytes (3.4 MB)
==================================================
downloaded 3.4 MB

The downloaded binary packages are in
    /var/folders/ph/sb4qh05n1c33wyrwt1mbzz8c0000gp/T//Rtmpz04d4l/downloaded_packages
library(maps)

Attaching package: ‘maps’

The following object is masked from ‘package:purrr’:

    map
nz<- map_data("nz")
ggplot(nz, aes(long,lat, group = group)) +
  geom_polygon(fill = "pink", colour = "black")

#coord_quickmap sets the correct ratio for geospatial data
ggplot(nz, aes(long, lat, group = group)) +
  geom_polygon(fill = "pink", colour = "black") +
  coord_quickmap()

bar <- ggplot(data = diamonds) + 
  geom_bar(
    mapping = aes(x = cut, fill = cut), 
    show.legend = FALSE,
    width = 1
  ) + 
  theme(aspect.ratio = 1) +
  labs(x = NULL, y = NULL)
bar + coord_flip()

bar + coord_polar()

# Data Transformation
library(nycflights13)
library(tidyverse)
flights
# 5.2 filter
# select all flights on Jan 1st
jan1 <-filter(flights, month == 1, day == 1)
jan1
nov_dec <- filter(flights, month %in% c(11, 12))
nov_dec2 <- filter(flights, month == 11 | month == 12)
# 5.3 Arrange
arrange(flights, year, month, day)
arrange(flights, desc(arr_delay))
# 5.6 Grouped Summaries
by_day <- group_by(flights, year, month, day)
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
# explore the relationship between the distance and the avg. delay for each destination
#group flights by destination
#summarize to compute distance, average delay, # of flights
#filter outliers and honolulu
#na.rm = TRUE removes missing values
by_dest <- group_by(flights, dest)
delay <- summarise(by_dest,
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
)
delay <- filter(delay,count >20, dest != "HNL")
ggplot(data = delay, mapping = aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 1/3) +
  geom_smooth(se = FALSE)

#using pipes to make variable naming and analysis easier
delays <- flights %>% 
  group_by(dest) %>% 
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>% 
  filter(count > 20, dest != "HNL")
batting <- as_tibble(Lahman::Batting)
batters <- batting %>% 
  group_by(playerID) %>% 
  summarise(
    ba = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE),
    ab = sum(AB, na.rm = TRUE)
  )
batters %>% 
  filter(ab > 100) %>% 
  ggplot(mapping = aes(x = ab, y = ba)) +
    geom_point() + 
    geom_smooth(se = FALSE)

#> `geom_smooth()` using method = 'gam'
# 7 Exploratory Data Analysis
#Generate Questions about your data
#search for answers using visualization and transformation, and modeling
#use what you learn to refine your questions and generate new questions
# 7.3 Variation
# tendency of the values of a variable to change from measurement to measurement
# visualizing distributions: categorical or continuous
# categorical -- use a bar chart
ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut))

diamonds %>%
  count(cut)

The height of the bars shows how many observations occurred at each x value.

# use a histogram for continuous data
# binwidth sets the width of the intervals (bins) that the values are grouped into
ggplot(data = diamonds) +
  geom_histogram(mapping = aes(x = carat), binwidth = 0.5)

diamonds %>% 
  count(cut_width(carat, 0.5))
# different binwidths can reveal different relationships in the data
smaller <- diamonds %>% 
  filter(carat < 3)
  
ggplot(data = smaller, mapping = aes(x = carat)) +
  geom_histogram(binwidth = 0.1)

ggplot(data = smaller, mapping = aes(x = carat, colour = cut)) +
  geom_freqpoly(binwidth = 0.1)

ggplot(data = faithful, mapping = aes(x = eruptions)) + 
  geom_histogram(binwidth = 0.1)

# 7.5 Covariation
# tendency for the values of two variables to be related
# how does the price of a diamond vary with its quality?
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot()

# Frequency polygons of price, one line per cut, with the stat-computed
# density on the y-axis instead of the raw count.
# after_stat(density) replaces the ..density.. notation, which is
# deprecated since ggplot2 3.4.0.
ggplot(data = diamonds, mapping = aes(x = price, y = after_stat(density))) +
  geom_freqpoly(mapping = aes(colour = cut), binwidth = 500)

ggplot(diamonds) + 
  geom_bar(mapping = aes(x = cut))

# how does highway mileage vary across classes of vehicles?
ggplot(data = mpg) +
  geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy))

# Two categorical variables
ggplot(data = diamonds) +
  geom_count(mapping = aes(x = cut, y = color))

diamonds %>% 
  count(color, cut) %>%  
  ggplot(mapping = aes(x = color, y = cut)) +
    geom_tile(mapping = aes(fill = n))

# two continuous variables
ggplot(data = diamonds) +
  geom_point(mapping = aes(x = carat, y = price))

#binning for two continuous variables is helpful to visualize large data
ggplot(data = smaller) +
  geom_bin2d(mapping = aes(x = carat, y = price))

# Install hexbin only when it is missing, so re-running the notebook does not
# download and reinstall the package every time.
if (!requireNamespace("hexbin", quietly = TRUE)) {
  install.packages("hexbin")
}
Installing package into ‘/Users/sarahbarili/Library/R/3.3/library’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.3/hexbin_1.27.1.tgz'
Content type 'application/x-gzip' length 773778 bytes (755 KB)
==================================================
downloaded 755 KB

The downloaded binary packages are in
    /var/folders/ph/sb4qh05n1c33wyrwt1mbzz8c0000gp/T//Rtmpz04d4l/downloaded_packages
ggplot(data = smaller) +
  geom_hex(mapping = aes(x = carat, y = price))

#another option is to bin one continuous variable so it behaves like a categorical variable
ggplot(data = smaller, mapping = aes(x = carat, y = price)) + 
  geom_boxplot(mapping = aes(group = cut_width(carat, 0.1)))

# 7 Patterns and models
# build a model that predicts price from carat
#computes residuals
library(modelr)
mod <- lm(log(price) ~ log(carat), data = diamonds)
diamonds2 <- diamonds %>% 
  add_residuals(mod) %>% 
  mutate(resid = exp(resid))
ggplot(data = diamonds2) + 
  geom_point(mapping = aes(x = carat, y = resid))

relative to size, better quality diamonds are more expensive

ggplot(data = diamonds2) + 
  geom_boxplot(mapping = aes(x = cut, y = resid))

# the two calls below are equivalent; the second one omits the argument names
ggplot(data = faithful, mapping = aes(x = eruptions)) + 
  geom_freqpoly(binwidth = 0.25)

ggplot(faithful, aes(eruptions)) + 
  geom_freqpoly(binwidth = 0.25)

#Data Wrangling
#getting your data into R 
#import, tidy, transform
# 10 Tibbles
# tibbles are data frames 
library(tidyverse)
# coerce a regular data frame to a tibble
as_tibble(iris)
#create a 5X3 tibble
tibble(
  x = 1:5, 
  y = 1, 
  z = x ^ 2 + y
)
tibble(
  a = lubridate::now() + runif(1e3) * 86400,
  b = lubridate::today() + runif(1e3) * 30,
  c = 1:1e3,
  d = runif(1e3),
  e = sample(letters, 1e3, replace = TRUE)
)
#tibbles will automatically print the first 10 rows of a tibble
#this can be customized
nycflights13::flights %>% 
  print(n = 10, width = Inf)
df <- tibble(
  x = runif(5),
  y = rnorm(5)
)
# Subsetting
# [[ ]] extracts by name or position
df[[1]]
[1] 0.4117549 0.8229977 0.2308909 0.5321475 0.3877842
df[["x"]]
[1] 0.4117549 0.8229977 0.2308909 0.5321475 0.3877842
# $ extracts by name with less typing
df$x
[1] 0.4117549 0.8229977 0.2308909 0.5321475 0.3877842
#use in a pipe
df %>% .$x
[1] 0.4117549 0.8229977 0.2308909 0.5321475 0.3877842
df %>% .[["x"]]
[1] 0.4117549 0.8229977 0.2308909 0.5321475 0.3877842
# 23  Model Basics
library(tidyverse)
library(modelr)
options(na.action = na.warn)
# Define a family of models that express a precise, but generic pattern (i.e. a line, quadratic, etc)
# generate a fitted model by making the generic model more specific
ggplot(sim1, aes(x, y)) + 
  geom_point()

Simulated Plot with a strong pattern

#randomly generate a few models and overlay them on the data
models <- tibble(
  a1 = runif(250, -20, 40),
  a2 = runif(250, -5, 5)
)
ggplot(sim1, aes(x, y)) + 
  geom_abline(aes(intercept = a1, slope = a2), data = models, alpha = 1/4) +
  geom_point()

Our data with a bunch of poorly fitting models

# quantify the distance between the data and a model
# generate model with the smallest distance from the data 
# Step 1: turn model family into an R function
# Linear model family: predictions for the x column of `data`.
#   a    - parameter vector; a[1] is used as the intercept, a[2] as the slope
#   data - data frame (or list) with a numeric element `x`
# Returns a[1] + data$x * a[2], one value per element of data$x.
model1 <- function(a, data) {
  intercept <- a[1]
  slope <- a[2]
  intercept + data$x * slope
}
model1(c(7, 1.5), sim1)
 [1]  8.5  8.5  8.5 10.0 10.0 10.0 11.5 11.5 11.5 13.0 13.0
[12] 13.0 14.5 14.5 14.5 16.0 16.0 16.0 17.5 17.5 17.5 19.0
[23] 19.0 19.0 20.5 20.5 20.5 22.0 22.0 22.0
# Root-mean-squared deviation between the observed data$y and the values
# predicted by model1() for the parameter vector `mod`.
#   mod  - parameter vector, passed to model1() as `a`
#   data - data with elements `x` (read by model1) and `y`
measure_distance <- function(mod, data) {
  predicted <- model1(mod, data)
  squared_error <- (data$y - predicted) ^ 2
  sqrt(mean(squared_error))
}
# root-mean-squared deviation
measure_distance(c(7, 1.5), sim1)
[1] 2.665212
# use purrr::map2_dbl() to compute the distance for all of the models defined above
# Distance between the sim1 data and the model with parameters (a1, a2).
# A two-argument wrapper around measure_distance() so it can be applied
# pairwise to the a1 and a2 columns.
sim1_dist <- function(a1, a2) {
  params <- c(a1, a2)
  measure_distance(params, data = sim1)
}
models <- models %>% 
  mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))
models
# overlay the 10 best models onto the data
ggplot(sim1, aes(x, y)) + 
  geom_point(size = 2, colour = "grey30") + 
  geom_abline(
    aes(intercept = a1, slope = a2, colour = -dist), 
    data = filter(models, rank(dist) <= 10)
  )

ggplot(sim1, aes(x, y)) + 
  geom_point(size = 2, colour = "grey30") + 
  geom_abline(
    aes(intercept = a1, slope = a2, colour = -dist), 
    data = filter(models, rank(dist) <= 1)
  )

ggplot(models, aes(a1, a2)) +
  geom_point(data = filter(models, rank(dist) <= 10), size = 4, colour = "red") +
  geom_point(aes(colour = -dist))

#grid search for best model
# Evaluate sim1_dist() on an evenly spaced 25 x 25 grid of (a1, a2) values.
# length.out is spelled out in full instead of relying on partial matching
# of `length`.
grid <- expand.grid(
  a1 = seq(-5, 20, length.out = 25),
  a2 = seq(1, 3, length.out = 25)
) %>%
  mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))
grid %>% 
  ggplot(aes(a1, a2)) +
  geom_point(data = filter(grid, rank(dist) <= 10), size = 4, colour = "red") +
  geom_point(aes(colour = -dist)) 

ggplot(sim1, aes(x, y)) + 
  geom_point(size = 2, colour = "grey30") + 
  geom_abline(
    aes(intercept = a1, slope = a2, colour = -dist), 
    data = filter(grid, rank(dist) <= 10)
  )

---
title: "R Notebook"
output: html_notebook
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 

Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. 
```{r}
plot(rnorm(100))
```

```{r}
plot(cars)
```
```{r}
# Data Visualization
# ggplot2 "grammar of graphics"
#libraries need to be reloaded for every new session
library(tidyverse)
```

Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Cmd+Option+I*.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Cmd+Shift+K* to preview the HTML file).
```{r}
# 3.2 
# do cars with large engines use more fuel than cars with small engines?
# what does the relationship between engine size and fuel efficiency look like?
# use mpg: a data frame containing observations collected by USEPA on cars
mpg
```
image of the mpg data frame
```{r}
# Scatter plot of the mpg data, built with ggplot2:
#   x-axis: displ, the car's engine size
#   y-axis: hwy, the car's highway fuel efficiency
# geom_point() adds a layer of points, which makes this a scatter plot.
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy))
```
-- plot of the data from 'mpg', plotting a car's engine size on the x-axis against its highway efficiency on the y-axis
-- looks like cars with larger engines use more fuel (duh)
```{r}
ggplot(data = mpg) +
  geom_point(mapping = aes(x = hwy, y = cyl))
```
```{r}
# 3.3 Aesthetic Mappings
# A third variable can be shown on a 2D scatter plot by mapping it to a
# visual property (an aesthetic) of the points.
# The displ-vs-hwy scatter plot again, this time with class mapped to color.
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy, color = class))
```
```{r}
#class is an unordered var, so it may not be optimal to map it using size
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, size = class))
```

```{r}
# alpha is transparency
ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy, alpha = class))

# shape has six shapes used
ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy, shape = class))
```
r rules
```{r}
# the aesthetic properties may be manually set
ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy), color = "pink")
```
```{r}
# aesthetics can also be mapped to a continuous variable:
# here cty (city mileage) is mapped to size, then to color
ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy, size = cty))

ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy, color = cty))
```
These plots map a continuous variable (city mileage) to different aesthetics (size and color).
```{r}
# 3.5 Facets
# another method for multi variable data plotting
# var to facet wrap should be discrete

#example of 1 variable faceting
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_wrap(~class, nrow = 2)

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_grid(.~class)
```
A plot of highway mileage against engine size (displ), faceted by class of vehicle (one variable).
```{r}
# example of 2-var faceting
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_grid(drv~class)

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_grid(drv~cyl)

```
Plot of data faceted on two variables
```{r}
# 3.6 Geometric Objects
# A 'geom' is the geometrical object a plot uses to represent the data.

# The data drawn as points.
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy))

# The same variables drawn as a smoothed line.
ggplot(mpg) +
  geom_smooth(aes(x = displ, y = hwy))

# Both geoms on one plot. The mapping is given once, in ggplot(), so it
# does not have to be duplicated in every layer.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth()
```
different geometric representations of the same data
```{r}
#different aesthetics can be displayed in different layers
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 
  geom_point(mapping = aes(color = class)) + 
  geom_smooth()
```
same plot as fig 14.c , with the scatter plot colored by class
```{r}
# local data arguments will override globals 
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 
  geom_point(mapping = aes(color = class)) + 
  geom_smooth(data = filter(mpg, class == "subcompact"), se = FALSE)
```
Same plot as fig 15 with line filtered to include only class subcompact
```{r}
ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = drv)) + 
  geom_point() + 
  geom_smooth(se = FALSE)
```
```{r}
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 
  geom_point() + 
  geom_smooth(mapping = aes(group = drv))

ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 
  geom_point(mapping = aes(color = drv)) + 
  geom_smooth(se = FALSE)

ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 
  geom_point(mapping = aes(color = drv)) + 
  geom_smooth(mapping = aes(linetype = drv), se = FALSE)
```
```{r}
# Statistical Transforms
#bar charts, histograms, frequency polygons bin and count data
#smoothers fit a model to your data, plot predictions come from this model 
# box plots compute a summary of the distribution of the data

# stat_count() can be used instead of geom_bar() here
ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut))
```
chart of diamond count by cut quality
```{r}
# use stat = "identity" to override geom_bar()'s default stat and plot the precomputed freq values
demo <- tribble(
  ~cut,         ~freq,
  "Fair",       1610,
  "Good",       4906,
  "Very Good",  12082,
  "Premium",    13791,
  "Ideal",      21551
)

ggplot(data = demo) +
  geom_bar(mapping = aes(x = cut, y = freq), stat = "identity")

# Summarise depth for each cut by its minimum, median and maximum.
# fun / fun.min / fun.max replace fun.y / fun.ymin / fun.ymax, which are
# deprecated since ggplot2 3.3.0.
ggplot(data = diamonds) +
  stat_summary(
    mapping = aes(x = cut, y = depth),
    fun.min = min,
    fun.max = max,
    fun = median
  )
```
Fig. a: bars plotted from the precomputed freq values rather than from a computed count.
Fig. b: statistical summary (min, median, max) of depth for each cut.
```{r}
ggplot(data = diamonds) + 
  geom_bar(mapping = aes(x = cut, fill = clarity))
ggplot(data = diamonds, mapping = aes(x = cut, fill = clarity)) + 
  geom_bar(alpha = 1/5, position = "identity")
ggplot(data = diamonds, mapping = aes(x = cut, colour = clarity)) + 
  geom_bar(fill = NA, position = "identity")

ggplot(data = diamonds) + 
  geom_bar(mapping = aes(x = cut, fill = clarity), position = "fill")
ggplot(data = diamonds) + 
  geom_bar(mapping = aes(x = cut, fill = clarity), position = "dodge")
```
Colored bar charts: fill/colour introduces a second variable (clarity), arranged using different position adjustments.
```{r}
# in the plain scatter plot many points overlap, so some will be obscured
# position = "jitter" adds a small amount of noise to each point so that all of them can be seen
ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy))
ggplot(data = mpg) + 
  geom_point(mapping = aes(x = displ, y = hwy), position = "jitter")
```
Jittered vs. non-jittered
```{r}
# 3.9 Coordinate systems
ggplot(data = mpg, mapping = aes(x = class, y = hwy)) + 
  geom_boxplot()
ggplot(data = mpg, mapping = aes(x = class, y = hwy)) + 
  geom_boxplot() +
  coord_flip()
```
Flipping the coordinate system is useful for displaying horizontal boxplots.
```{r}
# The map data comes from the maps package. Install it only when it is
# missing, instead of downloading and reinstalling it on every run.
if (!requireNamespace("maps", quietly = TRUE)) {
  install.packages("maps")
}
library(maps)
nz <- map_data("nz")

# New Zealand outline in the default coordinate system
ggplot(nz, aes(long, lat, group = group)) +
  geom_polygon(fill = "pink", colour = "black")

# coord_quickmap() sets the correct aspect ratio for geospatial data
ggplot(nz, aes(long, lat, group = group)) +
  geom_polygon(fill = "pink", colour = "black") +
  coord_quickmap()
```
```{r}
bar <- ggplot(data = diamonds) + 
  geom_bar(
    mapping = aes(x = cut, fill = cut), 
    show.legend = FALSE,
    width = 1
  ) + 
  theme(aspect.ratio = 1) +
  labs(x = NULL, y = NULL)

bar + coord_flip()
bar + coord_polar()
```
```{r}
# Data Transformation
library(nycflights13)
library(tidyverse)
```

```{r}
flights
```

```{r}
# 5.2 filter
# select all flights on Jan 1st
jan1 <-filter(flights, month == 1, day == 1)
jan1
```
```{r}
nov_dec <- filter(flights, month %in% c(11, 12))
nov_dec2 <- filter(flights, month == 11 | month == 12)
```

```{r}
# 5.3 Arrange
arrange(flights, year, month, day)
```
```{r}
arrange(flights, desc(arr_delay))
```

```{r}
# 5.6 Grouped Summaries
by_day <- group_by(flights, year, month, day)
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
```

```{r}
# Explore the relationship between distance and average arrival delay for
# each destination:
#   1. group flights by destination
#   2. summarise each group: number of flights, mean distance, mean arrival
#      delay (na.rm = TRUE removes missing values before averaging)
#   3. filter out destinations with 20 or fewer flights, and Honolulu (HNL)
by_dest <- group_by(flights, dest)
delay <- summarise(
  by_dest,
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE)
)
delay <- filter(delay, count > 20, dest != "HNL")

ggplot(delay, aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 1 / 3) +
  geom_smooth(se = FALSE)

# The same three steps written as one pipeline, which avoids having to name
# every intermediate result.
delays <- flights %>%
  group_by(dest) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20, dest != "HNL")
```
```{r}
batting <- as_tibble(Lahman::Batting)

batters <- batting %>% 
  group_by(playerID) %>% 
  summarise(
    ba = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE),
    ab = sum(AB, na.rm = TRUE)
  )

batters %>% 
  filter(ab > 100) %>% 
  ggplot(mapping = aes(x = ab, y = ba)) +
    geom_point() + 
    geom_smooth(se = FALSE)
#> `geom_smooth()` using method = 'gam'
```
```{r}
# 7 Exploratory Data Analysis
#Generate Questions about your data
#search for answers using visualization and transformation, and modeling
#use what you learn to refine your questions and generate new questions

# 7.3 Variation
# tendency of the values of a variable to change from measurement to measurement
# visualizing distributions: categorical or continuous
# categorical -- use a bar chart
ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut))
diamonds %>%
  count(cut)
```
The height of the bars shows how many observations occurred at each x value.
```{r}
# use a histogram for continuous data
# binwidth sets the width of the intervals (bins) that the values are grouped into
ggplot(data = diamonds) +
  geom_histogram(mapping = aes(x = carat), binwidth = 0.5)

diamonds %>% 
  count(cut_width(carat, 0.5))
# different binwidths can reveal different relationships in the data
smaller <- diamonds %>% 
  filter(carat < 3)
  
ggplot(data = smaller, mapping = aes(x = carat)) +
  geom_histogram(binwidth = 0.1)
```

```{r}
ggplot(data = smaller, mapping = aes(x = carat, colour = cut)) +
  geom_freqpoly(binwidth = 0.1)
```
```{r}
ggplot(data = faithful, mapping = aes(x = eruptions)) + 
  geom_histogram(binwidth = 0.1)
```

```{r}
# 7.5 Covariation
# tendency for the values of two variables to be related
# how does the price of a diamond vary with its quality?

ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot()

# Frequency polygons of price, one line per cut, with the stat-computed
# density on the y-axis instead of the raw count.
# after_stat(density) replaces the ..density.. notation, which is
# deprecated since ggplot2 3.4.0.
ggplot(data = diamonds, mapping = aes(x = price, y = after_stat(density))) +
  geom_freqpoly(mapping = aes(colour = cut), binwidth = 500)

ggplot(diamonds) + 
  geom_bar(mapping = aes(x = cut))
```
```{r}
# how does highway mileage vary across classes of vehicles?
ggplot(data = mpg) +
  geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy))
```

```{r}
# Two categorical variables
ggplot(data = diamonds) +
  geom_count(mapping = aes(x = cut, y = color))
```

```{r}
diamonds %>% 
  count(color, cut) %>%  
  ggplot(mapping = aes(x = color, y = cut)) +
    geom_tile(mapping = aes(fill = n))
```
```{r}
# two continuous variables
ggplot(data = diamonds) +
  geom_point(mapping = aes(x = carat, y = price))
```

```{r}
# Binning two continuous variables is helpful for visualising large data.

# Rectangular bins
ggplot(data = smaller) +
  geom_bin2d(mapping = aes(x = carat, y = price))

# Hexagonal bins. hexbin is installed only when it is missing, instead of
# being downloaded and reinstalled on every run of this chunk.
if (!requireNamespace("hexbin", quietly = TRUE)) {
  install.packages("hexbin")
}
ggplot(data = smaller) +
  geom_hex(mapping = aes(x = carat, y = price))
```

```{r}
#another option is to bin one continuous variable so it behaves like a categorical variable
ggplot(data = smaller, mapping = aes(x = carat, y = price)) + 
  geom_boxplot(mapping = aes(group = cut_width(carat, 0.1)))
```

```{r}

# 7 Patterns and models
# build a model that predicts price from carat
#computes residuals
library(modelr)

mod <- lm(log(price) ~ log(carat), data = diamonds)
diamonds2 <- diamonds %>% 
  add_residuals(mod) %>% 
  mutate(resid = exp(resid))

ggplot(data = diamonds2) + 
  geom_point(mapping = aes(x = carat, y = resid))

```
relative to size, better quality diamonds are more expensive
```{r}
ggplot(data = diamonds2) + 
  geom_boxplot(mapping = aes(x = cut, y = resid))
```

```{r}

# the two calls below are equivalent; the second one omits the argument names

ggplot(data = faithful, mapping = aes(x = eruptions)) + 
  geom_freqpoly(binwidth = 0.25)
ggplot(faithful, aes(eruptions)) + 
  geom_freqpoly(binwidth = 0.25)
```
```{r}
#Data Wrangling
#getting your data into R 
#import, tidy, transform
# 10 Tibbles
# tibbles are data frames 
library(tidyverse)
```

```{r}
# coerce a regular data frame to a tibble
as_tibble(iris)

```

```{r}
#create a 5X3 tibble
tibble(
  x = 1:5, 
  y = 1, 
  z = x ^ 2 + y
)
```

```{r}


tibble(
  a = lubridate::now() + runif(1e3) * 86400,
  b = lubridate::today() + runif(1e3) * 30,
  c = 1:1e3,
  d = runif(1e3),
  e = sample(letters, 1e3, replace = TRUE)
)
```

```{r}
#tibbles will automatically print the first 10 rows of a tibble
#this can be customized
nycflights13::flights %>% 
  print(n = 10, width = Inf)
```
```{r}
df <- tibble(
  x = runif(5),
  y = rnorm(5)
)
# Subsetting
# [[ ]] extracts by name or position
df[[1]]
df[["x"]]
# $ extracts by name with less typing
df$x

#use in a pipe
df %>% .$x
df %>% .[["x"]]

```

```{r}
# 23  Model Basics
library(tidyverse)
library(modelr)
options(na.action = na.warn)
# Define a family of models that express a precise, but generic pattern (i.e. a line, quadratic, etc)
# generate a fitted model by making the generic model more specific

ggplot(sim1, aes(x, y)) + 
  geom_point()
```
Simulated Plot with a strong pattern
```{r}
#randomly generate a few models and overlay them on the data
models <- tibble(
  a1 = runif(250, -20, 40),
  a2 = runif(250, -5, 5)
)
ggplot(sim1, aes(x, y)) + 
  geom_abline(aes(intercept = a1, slope = a2), data = models, alpha = 1/4) +
  geom_point()
```
Our data with a bunch of poorly fitting models
```{r}
# quantify the distance between the data and a model
# generate model with the smallest distance from the data 
# Step 1: turn model family into an R function
# Linear model family: predictions for the x column of `data`.
#   a    - parameter vector; a[1] is used as the intercept, a[2] as the slope
#   data - data frame (or list) with a numeric element `x`
# Returns a[1] + data$x * a[2], one value per element of data$x.
model1 <- function(a, data) {
  intercept <- a[1]
  slope <- a[2]
  intercept + data$x * slope
}
model1(c(7, 1.5), sim1)
# Root-mean-squared deviation between the observed data$y and the values
# predicted by model1() for the parameter vector `mod`.
#   mod  - parameter vector, passed to model1() as `a`
#   data - data with elements `x` (read by model1) and `y`
measure_distance <- function(mod, data) {
  predicted <- model1(mod, data)
  squared_error <- (data$y - predicted) ^ 2
  sqrt(mean(squared_error))
}



```

```{r}
# root-mean-squared deviation
measure_distance(c(7, 1.5), sim1)
```
```{r}
# use purrr::map2_dbl() to compute the distance for all of the models defined above
# Distance between the sim1 data and the model with parameters (a1, a2).
# A two-argument wrapper around measure_distance() so it can be applied
# pairwise to the a1 and a2 columns.
sim1_dist <- function(a1, a2) {
  params <- c(a1, a2)
  measure_distance(params, data = sim1)
}

models <- models %>% 
  mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))
models
```
```{r}
# overlay the 10 best models onto the data
ggplot(sim1, aes(x, y)) + 
  geom_point(size = 2, colour = "grey30") + 
  geom_abline(
    aes(intercept = a1, slope = a2, colour = -dist), 
    data = filter(models, rank(dist) <= 10)
  )

ggplot(sim1, aes(x, y)) + 
  geom_point(size = 2, colour = "grey30") + 
  geom_abline(
    aes(intercept = a1, slope = a2, colour = -dist), 
    data = filter(models, rank(dist) <= 1)
  )
```
```{r}
ggplot(models, aes(a1, a2)) +
  geom_point(data = filter(models, rank(dist) <= 10), size = 4, colour = "red") +
  geom_point(aes(colour = -dist))
```

```{r}
# Grid search for the best model: evaluate sim1_dist() on an evenly spaced
# 25 x 25 grid of (a1, a2) values.
# length.out is spelled out in full instead of relying on partial matching
# of `length`.
grid <- expand.grid(
  a1 = seq(-5, 20, length.out = 25),
  a2 = seq(1, 3, length.out = 25)
) %>%
  mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))

# Every grid point coloured by -dist; the 10 points with the smallest
# distance are marked by larger red points drawn underneath.
grid %>%
  ggplot(aes(a1, a2)) +
  geom_point(data = filter(grid, rank(dist) <= 10), size = 4, colour = "red") +
  geom_point(aes(colour = -dist))
```
```{r}
ggplot(sim1, aes(x, y)) + 
  geom_point(size = 2, colour = "grey30") + 
  geom_abline(
    aes(intercept = a1, slope = a2, colour = -dist), 
    data = filter(grid, rank(dist) <= 10)
  )
```
```{r}

```

